import pandas as pd
# Load the baby-names workbook into a DataFrame.
df = pd.read_excel(
    io='C:\\Users\\halil\\OneDrive\\Masaüstü\\babynames.xlsx',
)
df.head(n=5)  # preview the first 5 rows
| | Year of Birth | Gender | Ethnicity | Child's First Name | Count | Rank |
|---|---|---|---|---|---|---|
| 0 | 2011 | FEMALE | HISPANIC | GERALDINE | 13 | 75 |
| 1 | 2011 | FEMALE | HISPANIC | GIA | 21 | 67 |
| 2 | 2011 | FEMALE | HISPANIC | GIANNA | 49 | 42 |
| 3 | 2011 | FEMALE | HISPANIC | GISELLE | 38 | 51 |
| 4 | 2011 | FEMALE | HISPANIC | GRACE | 36 | 53 |
df.columns #names of the columns in the dataset.
Index(['Year of Birth', 'Gender', 'Ethnicity', 'Child's First Name', 'Count',
'Rank'],
dtype='object')
df.shape #number of rows and columns.
(57582, 6)
df.dtypes #data types of the columns
Year of Birth int64 Gender object Ethnicity object Child's First Name object Count int64 Rank int64 dtype: object
# Number of missing values in each column.
df.isna().sum()
Year of Birth 0 Gender 0 Ethnicity 0 Child's First Name 0 Count 0 Rank 0 dtype: int64
import seaborn as sns
# Apply seaborn's default theme, then override the figure resolution and size.
sns.set_theme()
sns.set(rc={"figure.dpi": 300, "figure.figsize": (12, 9)})

# Heatmap of the missing-value mask. This dataset has no missing values.
# If it had, there would be two options: drop the affected rows outright, or
# fill the gaps with the column's mean (or median, as in the example below).
# Dropping is only appropriate when very few values are missing in the
# column, so that the result is not affected.
#   df.dropna(inplace=True)                            -> drops rows with missing values
#   df["Count"].fillna(count_median, inplace=True)     -> fills a column that has many gaps
sns.heatmap(data=df.isna(), cbar=False)
<Axes: >
# Median of the Count column.
count_median = df.loc[:, "Count"].median()
count_median
20.0
# Total number of missing cells in the whole dataset (0 means none are missing).
df.isna().sum().sum()
0
# If the values in the dataset did not have the correct type, we would have to convert them as well.
# To convert a column from object type to int type we would use:
# df["Count"] = df["Count"].astype("int64")
# Summary statistics of the Count column, rounded to whole numbers.
df["Count"].describe().round(decimals=0)
count 57582.0 mean 34.0 std 39.0 min 10.0 25% 13.0 50% 20.0 75% 36.0 max 426.0 Name: Count, dtype: float64
import matplotlib.pyplot as plt
# Bar chart of how many rows each ethnicity has.
# NOTE: value_counts() counts rows (name records), not the sum of Count.
plt.figure(figsize=(8, 6))
plt.title("The distribution of babies according to ethnic origins - BAR")
ethnicity_row_counts = df["Ethnicity"].value_counts()
ethnicity_row_counts.plot.bar(color="yellow")
plt.show()
import matplotlib.pyplot as plt
# Pie chart of the share of rows per ethnicity (one colour per wedge).
plt.title("The distribution of babies according to ethnic origins PieChart")
df["Ethnicity"].value_counts().plot.pie(
    autopct='%1.1f%%',
    colors=["yellow", "green", "black", "white", "blue", "purple", "gray"],
)
<Axes: title={'center': 'The distribution of babies according to ethnic origins PieChart'}, ylabel='count'>
import matplotlib.pyplot as plt
# Scatter plot of Count against Ethnicity.
# DataFrame.plot() creates its own figure when no `ax` is given, so a
# preceding plt.title() call lands on a separate, empty figure and the scatter
# plot ends up untitled. Passing `title=` attaches the title to the axes that
# are actually drawn. The former reset_index() call only added an unused
# 'index' column and is dropped.
df.plot(
    kind='scatter',
    x='Count',
    y='Ethnicity',
    color='yellow',
    title="The distribution of babies according to ethnic origins ScatterPlot",
)
<Axes: xlabel='Count', ylabel='Ethnicity'>
# Row with the single largest Count among the 2011 records.
most_common_name_2011 = df[df['Year of Birth'] == 2011].nlargest(1, 'Count')
count_of_most_common_name_2011 = most_common_name_2011['Count'].values[0]
# Select the name column by label: positional column 1 is 'Gender', which made
# this cell print "MALE" instead of the first name.
most_common_first_name_2011 = most_common_name_2011["Child's First Name"].values[0]
print(f"2011 yılında en çok konulan ilk isim: {most_common_first_name_2011}")
print(f"Kaç kere konuldu: {count_of_most_common_name_2011}")
2011 yılında en çok konulan ilk isim: MALE Kaç kere konuldu: 426
# Pick the 2011 record with the highest Count and read its name and count.
most_common_name_2011 = (
    df.loc[df['Year of Birth'] == 2011]
    .nlargest(n=1, columns='Count')
)
most_common_name = most_common_name_2011["Child's First Name"].to_numpy()[0]
count_of_most_common_name = most_common_name_2011['Count'].to_numpy()[0]

# Show the result
print(f"2011 yılında en çok konulan ilk isim: {most_common_name}")
print(f"Kaç kere konuldu: {count_of_most_common_name}")
2011 yılında en çok konulan ilk isim: JAYDEN Kaç kere konuldu: 426
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Sum of Count per gender, drawn as a pie twice: matplotlib, then plotly.
total_counts = df.groupby('Gender')['Count'].sum()
labels = total_counts.index
sizes = total_counts.values

# FEMALE is drawn in pink; every other label falls back to blue.
gender_palette = {'FEMALE': 'pink'}
colors = [gender_palette.get(label, 'blue') for label in labels]

# Static matplotlib version.
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Total Counts by Gender')
plt.show()

# Interactive plotly version of the same chart.
gender_pie_trace = go.Pie(labels=labels, values=sizes, marker=dict(colors=colors))
fig = go.Figure(data=[gender_pie_trace])
fig.update_layout(title='Distribution of Total Counts by Gender')
fig.show()
# Bar chart of the summed Count per gender (pink for FEMALE, blue otherwise).
total_counts = df.groupby('Gender')['Count'].sum()
gender_palette = {'FEMALE': 'pink'}
colors = [gender_palette.get(label, 'blue') for label in total_counts.index]
total_counts.plot.bar(color=colors)
plt.title('Total Counts by Gender')
plt.xlabel('Gender')
plt.ylabel('Total Count')
plt.xticks(rotation=0)  # keep the gender labels horizontal
plt.show()
# Number of children born per year (original author's description).
# NOTE: value_counts() counts rows per year, not the sum of the Count column.
df["Year of Birth"].value_counts().plot.bar(color="green")
<Axes: xlabel='Year of Birth'>
import matplotlib.pyplot as plt
# Pie chart of the share of rows per birth year.
plt.title("Birth rate by year")
df["Year of Birth"].value_counts().plot.pie(
    autopct='%1.1f%%',
    colors=["yellow", "green", "black", "pink", "blue", "purple", "gray", "orange", "brown"],
)
<Axes: title={'center': 'Birth rate by year'}, ylabel='count'>
# Number of rows per (Ethnicity, Gender) pair. size() counts records, so the
# 'Count' column created here is a row count, not the sum of df['Count'].
eth_gender_counts = df.groupby(['Ethnicity', 'Gender']).size().reset_index(name='Count')
# Horizontal grouped bars: counts on the x axis, ethnicities on the y axis.
sns.barplot(x='Count', y='Ethnicity', hue='Gender', data=eth_gender_counts)
# Axis labels now match the plotted axes (they were swapped before):
# x shows the count ('Sayı'), y shows the ethnicity ('Etnik Köken').
plt.xlabel('Sayı')
plt.ylabel('Etnik Köken')
plt.title('Number of Girls and Boys Babies by Ethnicity')
plt.show()
# Share of each ethnicity in eth_gender_counts (both genders combined).
# Labels are taken from the grouped index itself, so every wedge is guaranteed
# to carry the label of the value it shows. Previously labels came from
# unique(), whose order only matched because the frame was already sorted.
ethnicity_totals = eth_gender_counts.groupby('Ethnicity')['Count'].sum()
plt.pie(ethnicity_totals, labels=ethnicity_totals.index, autopct='%1.1f%%', startangle=90)
plt.title('Rate of Girls and Boys Babies by Ethnicity')
plt.show()
# Reshape to one row per ethnicity and one column per gender, then stack the
# gender columns on top of each other in a bar chart.
eth_gender_counts_pivot = eth_gender_counts.pivot_table(
    values='Count',
    index='Ethnicity',
    columns='Gender',
    aggfunc='sum',
)
eth_gender_counts_pivot.plot.bar(stacked=True)
plt.title('Number of Girls and Boys Babies by Ethnicity (Stacked Bar Plot)')
plt.xlabel('Etnik Köken')
plt.ylabel('Sayı')
plt.show()
import pandas as pd
import plotly.express as px
# Sum of Count per (Ethnicity, Gender), pivoted so each gender is a column,
# then drawn as an interactive stacked bar chart with plotly express.
eth_gender_counts = (
    df.groupby(['Ethnicity', 'Gender'])['Count']
    .sum()
    .reset_index()
)
eth_gender_counts_pivot = eth_gender_counts.pivot_table(
    values='Count',
    index='Ethnicity',
    columns='Gender',
    aggfunc='sum',
)
fig = px.bar(
    eth_gender_counts_pivot,
    barmode='stack',
    labels={'Count': 'Number'},
    title='Number of Girls and Boys Babies by Ethnicity (Stacked Bar Plot)',
    height=400,
)
fig.update_layout(xaxis_title='Ethnicity', yaxis_title='Number')
fig.show()
# Restrict the data to children born in 2011.
data_2011 = df[df['Year of Birth'] == 2011]
# Gender values in this dataset are upper-case ('FEMALE' / 'MALE'); the
# previous 'Female' / 'Male' comparisons never matched and produced empty frames.
female_children = data_2011[data_2011['Gender'] == 'FEMALE']
male_children = data_2011[data_2011['Gender'] == 'MALE']
# Sum of Count per (Ethnicity, Gender) pair for 2011.
eth_gender_counts = data_2011.groupby(['Ethnicity', 'Gender'])['Count'].sum().reset_index()
plt.figure(figsize=(12, 6))
# One bar per ethnicity and gender; errorbar=None turns off error bars.
sns.barplot(x='Ethnicity', y='Count', hue='Gender', data=eth_gender_counts, errorbar=None)
plt.xlabel('Ethnicity')
plt.ylabel('Total Number of Children')
plt.title('Distribution of Girls and Boys by Ethnicity in the Year 2011')
plt.show()
import pandas as pd
import seaborn as sns
import plotly.express as px
# Interactive plotly bar chart of the 2011 totals per ethnicity, coloured by gender.
data_2011 = df.loc[df['Year of Birth'] == 2011]
eth_gender_counts = (
    data_2011.groupby(['Ethnicity', 'Gender'])['Count']
    .sum()
    .reset_index()
)
fig = px.bar(
    eth_gender_counts,
    x='Ethnicity',
    y='Count',
    color='Gender',
    labels={'Count': 'Total Number of Children'},
    title='Distribution of Girls and Boys by Ethnicity in the Year 2011',
    height=400,
)
fig.show()